Registered S3 method overwritten by 'data.table':
method from
print.data.table
Registered S3 methods overwritten by 'dbplyr':
method from
print.tbl_lazy
print.tbl_sql
-- Attaching packages ------------------------------------------------------------------------- tidyverse 1.3.1 --
v tibble 3.1.1 v dplyr 1.0.5
v tidyr 1.1.3 v stringr 1.4.0
v readr 1.4.0 v forcats 0.5.1
v purrr 0.3.4
-- Conflicts ---------------------------------------------------------------------------- tidyverse_conflicts() --
x dplyr::filter() masks stats::filter()
x dplyr::lag() masks stats::lag()
Loading required package: gsubfn
Loading required package: proto
Loading required package: RSQLite
Attaching package: 'lubridate'
The following objects are masked from 'package:base':
date, intersect, setdiff, union
Loading required package: Matrix
Attaching package: 'Matrix'
The following objects are masked from 'package:tidyr':
expand, pack, unpack
Attaching package: 'arules'
The following object is masked from 'package:dplyr':
recode
The following objects are masked from 'package:base':
abbreviate, write
Loading required package: NLP
Attaching package: 'NLP'
The following object is masked from 'package:ggplot2':
annotate
Attaching package: 'tm'
The following object is masked from 'package:arules':
inspect
Attaching package: 'tau'
The following object is masked from 'package:readr':
tokenize
df_artist <- read.csv("data/df_artist_sin_duplicados.csv")
df_charts_raw <- read.csv("data/df_charts_sin_duplicados.csv")
df_audio_features_raw <- read.csv("data/audio_features_plano_sin_duplicados.csv")
df_lyrics <- read.csv("data/df_lyrics.csv")
# DF ready for the join with charts.
# Collapse multi-artist tracks into one row per (first artist, track URL):
# all artist names of a track are concatenated with the literal separator
# ",|," and artist_key keeps only the first artist, used later as join key.
df_audio_features <- df_audio_features_raw %>%
  group_by(track_name, external_urls_spotify) %>%
  mutate(artist_all = paste(artist_name, collapse = ",|,")) %>%
  ungroup() %>%
  # BUG FIX: the separator must be escaped. In the original pattern ",|,.*"
  # the "|" is regex alternation, so an artist name containing a plain comma
  # (e.g. "Tyler, The Creator") was truncated at that comma instead of at
  # the ",|," separator.
  mutate(artist_key = sub(",\\|,.*", "", artist_all)) %>%
  dplyr::select(artist_name, artist_all, artist_key, everything()) %>%
  distinct(artist_key, external_urls_spotify, .keep_all = TRUE) %>%
  as.data.frame()
# Count the number of market codes in a comma-separated string.
# BUG FIX: the original definition's name was mangled to
# "cant_marketscontar_market" (a section label fused with the function
# name), so the `contar_market` call in the sapply below could not find it.
contar_market <- function(x){
# number of tokens after splitting on ","
q <- length(unlist(strsplit(x, split = ",")))
return (q)
}
# Markets count per track, from the comma-separated markets_concat column.
# NOTE(review): sapply's return type depends on input; vapply(..., numeric(1))
# would pin it — confirm markets_concat is always character.
df_audio_features$cant_markets <- sapply(df_audio_features[,"markets_concat"], contar_market)
# continuous features
features_continuas <- c('acousticness', 'danceability', 'duration_ms', 'energy', 'instrumentalness', 'liveness', 'loudness', 'speechiness', 'tempo', 'valence', 'cant_markets')
# categorical features
features_categoricas <- c('explicit', 'key_name', 'mode_name', "key_mode")
# Impute implausible loudness values (> 0 dB) with predictions from a
# linear regression on energy and acousticness fitted over all rows.
fit <- lm(loudness~energy+acousticness, data=df_audio_features)
modelo <- fit$coefficients
df_audio_features$loudness_reg_imp <- df_audio_features$loudness
# predictors for the rows being imputed (loudness > 0)
X <- df_audio_features[df_audio_features$loudness>0, c('energy', "acousticness")]
# manual prediction: intercept + b1*energy + b2*acousticness
df_audio_features$loudness_reg_imp[df_audio_features$loudness>0] <- modelo[1]+modelo[2]*X[,1]+modelo[3]*X[,2]
summary(df_audio_features[,c("loudness", "loudness_reg_imp")])
loudness loudness_reg_imp
Min. :-60.000 Min. :-60.000
1st Qu.: -9.139 1st Qu.: -9.139
Median : -6.441 Median : -6.441
Mean : -7.513 Mean : -7.515
3rd Qu.: -4.780 3rd Qu.: -4.781
Max. : 3.515 Max. : -0.045
summary(fit)
Call:
lm(formula = loudness ~ energy + acousticness, data = df_audio_features)
Residuals:
Min 1Q Median 3Q Max
-44.943 -1.179 0.314 1.554 11.188
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -15.05720 0.03312 -454.57 <2e-16 ***
energy 12.41072 0.04081 304.12 <2e-16 ***
acousticness -1.48894 0.03016 -49.37 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 2.493 on 149537 degrees of freedom
Multiple R-squared: 0.6179, Adjusted R-squared: 0.6179
F-statistic: 1.209e+05 on 2 and 149537 DF, p-value: < 2.2e-16
#graficos con loudness con imputacion
par(mfrow = c(2,1))
hist(df_audio_features[,'loudness_reg_imp'], main='loudness', xlab="")
#hist(sqrt(df_audio_features[,'loudness_reg_imp']), main= 'loudness_sqrt', xlab="")
boxplot(df_audio_features[,'loudness_reg_imp'], horizontal = T)
#boxplot(sqrt(df_audio_features[,'loudness_reg_imp']), horizontal = T)
# Popularity metric: per (Artist, Track) aggregate weeks charted, stream
# totals (scaled to millions) and chart-position summaries.
# Consistency fix: min()/max() on Streams and Position now pass
# na.rm = TRUE like the sum/mean/median calls already did, so a single NA
# no longer turns those summaries into NA.
df_charts <- df_charts_raw %>%
  group_by(Artist, Track_Name) %>%
  dplyr::summarise(semanas_sum = as.double(n()),
                   streams_sum = (sum(Streams, na.rm = TRUE)/10^6 ),
                   streams_min = (min(Streams, na.rm = TRUE)/10^6 ),
                   streams_max = (max(Streams, na.rm = TRUE)/10^6 ),
                   streams_avg = (mean(Streams, na.rm = TRUE)/10^6),
                   position_avg = mean(Position, na.rm = TRUE),
                   position_median = median(Position, na.rm = TRUE),
                   position_min = min(Position, na.rm = TRUE),
                   position_max = max(Position, na.rm = TRUE)) %>%
  ungroup() %>%
  # popularity grows with total streams and weeks charted and shrinks as
  # the average chart position worsens (larger number = lower position)
  mutate(popularidad = as.numeric(streams_sum*semanas_sum/position_avg) )
`summarise()` has grouped output by 'Artist'. You can override using the `.groups` argument.
library(reshape2)
Attaching package: 'reshape2'
The following object is masked from 'package:tidyr':
smiths
ggplot(melt(df_charts[,3:ncol(df_charts)]), aes(value))+
geom_histogram()+
facet_wrap(~variable , scales = "free")
No id variables; using all as measure variables
## audio_features y charts
#Armamos un join para tener una tabla de charts con las caracteristicas de las canciones
# deberian quedar 22993 filas completas
join_audio_charts <- df_audio_features %>%
select("artist_name","artist_all","artist_key",
"track_name", "external_urls_spotify", "album_name", "album_release_year",
all_of(features_continuas), all_of(features_categoricas)) %>%
right_join( df_charts,# %>%
by = c(
"track_name" = "Track_Name",
"artist_key" ="Artist"))
#HAY CHARTS QUE NO TIENEN FEATURES. HAY QUE TENERLO EN CUENTA PARA EL ANÁLISIS
library(mice)
Attaching package: 'mice'
The following object is masked from 'package:stats':
filter
The following objects are masked from 'package:base':
cbind, rbind
md.pattern(join_audio_charts, rotate.names = TRUE)
artist_key track_name semanas_sum streams_sum streams_min streams_max streams_avg position_avg
8786 1 1 1 1 1 1 1 1
2 1 1 1 1 1 1 1 1
610 1 1 1 1 1 1 1 1
0 0 0 0 0 0 0 0
position_median position_min position_max popularidad artist_name artist_all external_urls_spotify
8786 1 1 1 1 1 1 1
2 1 1 1 1 1 1 1
610 1 1 1 1 0 0 0
0 0 0 0 610 610 610
album_name acousticness danceability duration_ms energy instrumentalness liveness loudness speechiness tempo
8786 1 1 1 1 1 1 1 1 1 1
2 1 1 1 1 1 1 1 1 1 1
610 0 0 0 0 0 0 0 0 0 0
610 610 610 610 610 610 610 610 610 610
valence cant_markets explicit key_name mode_name key_mode album_release_year
8786 1 1 1 1 1 1 1 0
2 1 1 1 1 1 1 0 1
610 0 0 0 0 0 0 0 20
610 610 610 610 610 610 612 12202
## Histograms of the continuous audio_features variables, with mean (red)
## and median (blue) reference lines to eyeball skew per variable.
for (i in features_continuas){
hist(df_audio_features[,i], main = paste("Histograma de", i, "(all data)"), xlab = i)
abline(v = mean(df_audio_features[,i], na.rm = TRUE) , col="red")
abline(v = median(df_audio_features[,i], na.rm = TRUE) , col="blue")
legend("topright", legend = c("Media", "Mediana"), col=c("red", "blue"), lty =1)
}
#divido los features por su distribución
features_continuas_media <- c('danceability', 'tempo', 'valence')
features_continuas_mediana <- c('acousticness', 'duration_ms', 'energy', 'instrumentalness', 'liveness', 'loudness', 'speechiness', 'cant_markets')
##histograma de las variables continuas de charts
for (i in c(features_continuas)){
hist(join_audio_charts[,i], main = paste("Histograma de", i, "(charts)"), xlab = i)
abline(v = mean(join_audio_charts[,i], na.rm = TRUE) , col="red")
abline(v = median(join_audio_charts[,i], na.rm = TRUE) , col="blue")
}
#divido features de charts según su distribución
audio_charts_continuas_media <- c('duration_ms', 'valence')
audio_charts_continuas_mediana <- c('danceability', 'acousticness', 'tempo', 'energy', 'instrumentalness', 'liveness', 'loudness', 'speechiness', 'cant_markets', "Streams")
##medidas resumen y barplots de las variables categoricas audio_features
for(i in features_categoricas){
barplot(sort(table(df_audio_features[,i]),decreasing = T), las=2,
main = paste("Barplot de", i, "(all data)"))
# pie(table(df_features_categoricos[,i]))
}
##medidas resumen y barplots de las variables categoricas join_audio_charts
for(i in features_categoricas){
barplot(table(join_audio_charts[,i]), las=2,
main = paste("Barplot de", i, "(charts)")
)
# pie(table(df_features_categoricos[,i]))
}
hist(df_audio_features$instrumentalness)
par(mfrow=c(4,3))
for (feature in features_continuas){
boxplot(df_audio_features[,feature], las=2, horizontal=T, main=feature)
}
Con excepción de valence el resto de las features poseían cierto sesgo. Se decidió transformar las variables que mayor sesgo poseían: duration_ms, instrumentalness, liveness, speechiness como método de corregir la distribución y achicar la cantidad de outliers. La variable loudness_reg_imp no fue modificada debido a que, al ser negativa, la transformación logarítmica no puede aplicarse directamente.
# "danceability,tempo,valence,acousticness,duration_ms,energy,instrumentalness,liveness,speechiness,cant_markets"
#sesgos d las variables
sort(apply(df_audio_features[,features_continuas], MARGIN = 2, function(x){ (3* (mean(x,na.rm = T)-median(x, na.rm = T)))/sd(x, na.rm = T)} ))
cant_markets loudness energy danceability tempo valence
-2.22406959 -0.79744200 -0.60263786 -0.43255845 -0.12539338 0.04799836
duration_ms instrumentalness liveness speechiness acousticness
0.31418195 0.98200834 1.24673883 1.25181587 1.38512775
variables_sesgo <- unlist(strsplit("acousticness,duration_ms,instrumentalness,liveness,speechiness,cant_markets,energy", ","))
df_sesgadas <- df_audio_features[,variables_sesgo]
# Base-10 logarithm that tolerates exact zeros: a zero input is replaced by
# the small offset `delta` before taking the log (log10(0) would be -Inf).
# Scalar function — callers apply it elementwise via apply()/lapply().
logaritmo_ajustado <- function(x, delta) {
  safe_x <- if (x == 0.0) delta else x
  log(safe_x, base = 10)
}
delta <- 10^(-6)
df_sesgadas_log_adjust <- data.frame(apply(df_audio_features[,variables_sesgo], MARGIN = c(1,2),
function(x) logaritmo_ajustado(x,delta)))
ggplot(reshape::melt(df_sesgadas), aes(value))+
geom_histogram()+
facet_wrap(~variable)
Using as id variables
ggplot(reshape::melt(df_sesgadas_log_adjust), aes(value))+
geom_histogram()+
facet_wrap(~variable)
Using as id variables
####################################################
# names(df_sesgadas_log_adjust) <- paste(names(df_sesgadas), "_log", sep="")
# names(df_sesgadas_log_adjust) <- names(df_sesgadas)
# df_datos <- cbind(df_sesgadas, df_sesgadas_log_adjust)
a <- df_sesgadas
b <- df_sesgadas_log_adjust
names(b) <- paste(names(df_sesgadas), "_log", sep="")
merged <- cbind(a,b)
merged <- merged[, order(names(merged))]
round((
apply(
merged, MARGIN = 2, function(x){ (3* (mean(x,na.rm = T)-median(x, na.rm = T)))/sd(x, na.rm = T)}
)
),2)
acousticness acousticness_log cant_markets cant_markets_log duration_ms
1.39 -0.69 -2.22 -1.73 0.31
duration_ms_log energy energy_log instrumentalness instrumentalness_log
-0.10 -0.60 -0.95 0.98 1.84
liveness liveness_log speechiness speechiness_log
1.25 0.65 1.25 0.80
#histogramas de vbles transformadas con logaritmo
# transformacion <- c('instrumentalness','loudness','liveness','speechiness', 'duration_ms')
par(mfrow=c(3,5))
for (feature in variables_sesgo){
hist(df_audio_features[,feature], main=feature)
}
for (feature in variables_sesgo){
hist(unlist(lapply(df_audio_features[,feature], function(x) logaritmo_ajustado(x,delta))), main=paste(feature,"log", sep="_"))
}
# Inverse square root transform (1/sqrt(x)) used to reduce right skew.
# NOTE(review): the `delta` argument is never used — the zero branch
# hard-codes 0.000001 as the offset, giving 1/sqrt(1e-6) = 1000 for x == 0.
# At the call sites delta is sqrt(10^-6) = 1e-3, so honoring the parameter
# would change the transformed values; confirm intent before "fixing".
inv_sqrt_ajustada = function(x, delta){
if (x==0.0){
# zero input: add a tiny offset to avoid division by zero (1/sqrt(0) = Inf)
return(1/sqrt(x+0.000001))
}else{
return(1/sqrt(x))
}
}
delta <- sqrt(10^(-6))
par(mfrow=c(3,5))
for (feature in variables_sesgo){
hist(df_audio_features[,feature], main=feature)
}
for (feature in variables_sesgo){
hist(unlist(lapply(df_audio_features[,feature], function(x) inv_sqrt_ajustada(x,delta))), main=paste(feature,"inv_sqt", sep="_"))
}
df_sesgadas_inv_sqrt <- data.frame(apply(df_audio_features[,variables_sesgo], MARGIN = c(1,2),
function(x) inv_sqrt_ajustada(x,delta)))
#nuevos sesgos con inversa raiz cuadrada
a <- df_sesgadas
b <- df_sesgadas_inv_sqrt
names(b) <- paste(names(df_sesgadas), "_invsqrt", sep="")
merged <- cbind(a,b)
merged <- merged[, order(names(merged))]
round((
apply(
merged, MARGIN = 2, function(x){ (3* (mean(x,na.rm = T)-median(x, na.rm = T)))/sd(x, na.rm = T)}
)
),2)
acousticness acousticness_invsqrt cant_markets cant_markets_invsqrt
1.39 0.73 -2.22 0.21
duration_ms duration_ms_invsqrt energy energy_invsqrt
0.31 0.31 -0.60 0.12
instrumentalness instrumentalness_invsqrt liveness liveness_invsqrt
0.98 -0.77 1.25 0.03
speechiness speechiness_invsqrt
1.25 0.07
b <- df_sesgadas_inv_sqrt[,c("cant_markets", "energy", "instrumentalness", "liveness" , "speechiness")]
names(b) <- paste(names(b), "_invsqrt", sep="")
par(mfrow=c(1,2))
ggplot(data=reshape::melt(b), aes(value))+
geom_histogram(bins = 5)+facet_wrap(~variable, scales = "free")
Using as id variables
ggplot(data=reshape::melt(a), aes(value))+
geom_histogram(bins = 5)+facet_wrap(~variable, scales = "free")
Using as id variables
df_ses_chart <- df_charts %>%
ungroup() %>%
select( "Track_Name", "Artist",
"streams_avg", "position_avg",
"semanas_sum", "popularidad")
df_ses_chart_log <- data.frame(apply(df_ses_chart[,3:6], MARGIN = c(1,2),
function(x) logaritmo_ajustado(x,delta)))
cat("originales\n")
originales
#sesgos d las variables
(apply(df_ses_chart[,3:6], MARGIN = 2, function(x){ (3* (mean(x,na.rm = T)-median(x, na.rm = T)))/sd(x, na.rm = T)} ))
streams_avg position_avg semanas_sum popularidad
0.7985375 0.1054805 1.1508751 0.4276327
cat("\ntransformadas\n")
transformadas
(apply(df_ses_chart_log, MARGIN = 2, function(x){ (3* (mean(x,na.rm = T)-median(x, na.rm = T)))/sd(x, na.rm = T)} ))
streams_avg position_avg semanas_sum popularidad
0.5222497 -0.4514567 0.2030793 0.4806319
ggplot(reshape2::melt(df_ses_chart[,3:6]), aes(value))+
geom_histogram()+
facet_wrap(~variable, scales = "free")+
labs(title = "Histogramas originales")
No id variables; using all as measure variables
ggplot(reshape::melt(df_ses_chart_log), aes(value))+
geom_histogram()+
facet_wrap(~variable, scales = "free")+
labs(title = "Histogramas transformados")
Using as id variables
df_ses_chart_inv_sqrt <- data.frame(apply(df_ses_chart[,3:6], MARGIN = c(1,2),
function(x) inv_sqrt_ajustada(x,delta)))
cat("\ntransformadas\n")
transformadas
(apply(df_ses_chart_inv_sqrt, MARGIN = 2, function(x){ (3* (mean(x,na.rm = T)-median(x, na.rm = T)))/sd(x, na.rm = T)} ))
streams_avg position_avg semanas_sum popularidad
-0.2981080 0.5133950 0.7563671 1.2815297
charts_sin_sesgo <- cbind(df_ses_chart[,-c(3,5)],
"semanas_sum" =df_ses_chart_log[,c("semanas_sum")],
"streams_avg" = df_ses_chart_inv_sqrt[,c("streams_avg")])
#join entre variables transformadas y resto features
audio_sin_sesgo <- df_audio_features %>%
select("artist_name", "artist_key",
"track_name",
all_of(features_continuas), all_of(features_categoricas)) %>%
select(!variables_sesgo)
Note: Using an external vector in selections is ambiguous.
i Use `all_of(variables_sesgo)` instead of `variables_sesgo` to silence this message.
i See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
This message is displayed once per session.
audio_ft_to_discretize <- cbind(audio_sin_sesgo, df_sesgadas_log_adjust[,c("acousticness", "duration_ms")],
df_sesgadas_inv_sqrt[,c("instrumentalness", "cant_markets",
"liveness" , "speechiness")]) %>%
select(-c( "key_name", "explicit", "mode_name" ,"key_mode")) %>%
group_by(artist_name, artist_key, track_name) %>%
distinct(.keep_all = T)
# Standardize a numeric vector to zero mean and unit variance (z-score),
# ignoring missing values in both the mean and the standard deviation.
scale_vble <- function(x){
  centered <- x - mean(x, na.rm = TRUE)
  centered / sd(x, na.rm = TRUE)
}
audio_ft_to_discretize_norm <- audio_ft_to_discretize %>%
# mutate_all(scale)
mutate_all(scale_vble)#
`mutate_all()` ignored the following grouping variables:
Columns `artist_name`, `artist_key`, `track_name`
Use `mutate_at(df, vars(-group_cols()), myoperation)` to silence the message.
ggplot(data= melt(audio_ft_to_discretize, id.vars = c("artist_name", "artist_key", "track_name")), aes(value))+
geom_histogram()+
facet_wrap(~variable, scales = "free")
ggplot(data= melt(audio_ft_to_discretize_norm, id.vars = c("artist_name", "artist_key", "track_name")), aes(value))+
geom_histogram()+
facet_wrap(~variable, scales = "free")
charts_sin_sesgo_norm <- charts_sin_sesgo %>%
rename(track_name = Track_Name, artist = Artist) %>%
ungroup() %>%
select(-c(track_name, artist)) %>%
mutate_all(function(x) {scale_vble(x)})
# mutate_at(., .vars = vars(-group_cols("Track_Name", "Artist")), function(x) {scale_vble(x)})
charts_sin_sesgo_norm <- cbind(charts_sin_sesgo[, c("Artist", "Track_Name")], charts_sin_sesgo_norm )
#funciones
# Discretize a right-skewed variable into 4 ordered bands using
# half-median / median / 75th percentile as cut points.
# Robustness fix: median() and quantile() now pass na.rm = TRUE (matching
# the na.rm usage elsewhere in this file); previously a single NA made the
# breaks NA and cut() failed.
cut_median <- function(x){
  cut(x,
      breaks = c(0, median(x, na.rm = TRUE)/2,
                 median(x, na.rm = TRUE),
                 quantile(x, probs = 0.75, na.rm = TRUE),
                 Inf),
      include.lowest = TRUE,
      labels = c("Baja","Media","Alta", "Muy alta") )
}
# Binary discretization at the median: values up to the median are "Baja",
# values above it "Alta".
# Consistency fix: median() now passes na.rm = TRUE like the min() call on
# the adjacent line already did; previously one NA made the middle break NA.
cut_binary <- function(x){
  cut(x,
      breaks = c(min(x,na.rm = TRUE),
                 median(x, na.rm = TRUE),
                 Inf),
      include.lowest = TRUE,
      labels = c("Baja","Alta"))
}
# Quartile discretization into 4 ordered bands (one per quartile).
# Robustness fix: quantile() now ignores NAs; previously any NA in x raised
# an error before the breaks could be computed.
# NOTE(review): if x has many ties, quartile breaks may coincide and cut()
# will still error on non-unique breaks — acceptable for these features.
cut_cuantile <- function(x){
  cut(x,
      breaks = quantile(x, na.rm = TRUE),
      include.lowest = TRUE,
      labels = c("Baja","Media","Alta", "Muy alta") )
}
#select data
df_audio_ft_select <-df_audio_features %>%
select(artist_name, artist_key, track_name, features_continuas, -loudness,loudness = loudness_reg_imp ) %>%
group_by(track_name, artist_key, artist_name) %>%
summarise_all(function(x) mean(x, na.rm = T)) %>%
left_join(df_audio_features %>%
select(artist_name, artist_key, track_name, explicit, mode_name) %>%
distinct(artist_name, artist_key, track_name, .keep_all = T) )
Note: Using an external vector in selections is ambiguous.
i Use `all_of(features_continuas)` instead of `features_continuas` to silence this message.
i See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
This message is displayed once per session.
Joining, by = c("track_name", "artist_key", "artist_name")
#creacion columnas
#by median
df_audio_ft_select$acousticness_cat <- cut_median(df_audio_ft_select$acousticness)
df_audio_ft_select$duration_ms_cat <- cut_median(df_audio_ft_select$duration_ms)
df_audio_ft_select$liveness_cat <-cut_median(df_audio_ft_select$liveness)
df_audio_ft_select$speechiness_cat <- cut_median(df_audio_ft_select$speechiness)
#by binary
df_audio_ft_select$instrumentalness_cat <- cut_binary(df_audio_ft_select$instrumentalness)
df_audio_ft_select$loudness_cat <- cut_binary(df_audio_ft_select$loudness)
#by cunatile
df_audio_ft_select$tempo_cat <- cut_cuantile(df_audio_ft_select$tempo)
df_audio_ft_select$valence_cat <- cut_cuantile(df_audio_ft_select$valence)
df_audio_ft_select$danceability_cat <- cut_cuantile(df_audio_ft_select$danceability)
#cant markets
df_audio_ft_select$cant_markets_cat <- cut(df_audio_ft_select$cant_markets,
breaks = c(0, 70, 110, Inf),
labels = c("Baja","Media","Alta"))
#true categories vbles
df_audio_ft_select$explicit_cat <- ifelse(df_audio_ft_select$explicit ==TRUE, "Si", "No")
df_audio_ft_select$mode_name_cat <- df_audio_ft_select$mode_name
# filtro
df_audio_ft_cat <- df_audio_ft_select %>%
select( artist_name, artist_key, track_name, contains("_cat") )
x <- df_audio_ft_cat <- df_audio_ft_select %>%
select( contains("_cat") )
Adding missing grouping variables: `track_name`, `artist_key`
for(i in names(x) ){
barplot(table(x[,i]), las=2,
main = paste("Barplot de", i, "(charts)") )
# cat(table( x[,i] ))
}
NA
NA
df_charts_sel <- df_charts %>%
ungroup() %>%
select( "Track_Name", "Artist",
"streams_avg", "position_median",
"semanas_sum", "popularidad")
ggplot(reshape2::melt(df_charts_sel[,3:6]), aes(value))+
geom_histogram()+
facet_wrap(~variable, scales = "free")+
labs(title = "Histogramas originales")
No id variables; using all as measure variables
df_charts_sel$streams_avg_cat = cut_cuantile(df_charts_sel$streams_avg)
df_charts_sel$popularidad_cat = cut_cuantile(df_charts_sel$popularidad)
df_charts_sel$position_median_cat = cut(df_charts_sel$position_median,
breaks = quantile(df_charts_sel$position_median),
include.lowest = T,
labels=c( "Muy alta", "Alta", "Media", "Baja"))
df_charts_sel$semanas_sum_cat = cut(df_charts_sel$semanas_sum,
breaks = c(0, 2,4,13, Inf),
include.lowest = T,
labels=c("Baja","Media","Alta", "Muy alta"))
# filtro
df_charts_cat <- df_charts_sel %>%
select( Artist, Track_Name, contains("_cat") )
#prueba fallida
audio_ft_to_discretize_norm %>%
ungroup() %>%
group_by(artist_name, track_name, artist_key) %>%
mutate_all( function(x){ cut(x, quantile(x, probs = seq(0, 1, 0.25
), na.rm = T,
# ) , labels = c("Muy Alta","Alta" ,"Media", "Baja"
# ) , labels = c("Muy Alta","Alta" ,"Media", "Baja"
) , labels = c("Muy Alta" ,"Alta", "Media", "Baja"
)
) }
)
# Discretizaciones basadas en Quantile (FUNCIONA)
library(R.oo)
cols <- names(audio_ft_to_discretize_norm)[!names(audio_ft_to_discretize_norm) %in% c("artist_name","artist_key", "track_name")]
df_num = audio_ft_to_discretize_norm[,cols]
df_cat <- audio_ft_to_discretize_norm[, c("artist_name","artist_key", "track_name")]
for(i in seq_along(df_num) ){
breaks =unique(quantile(df_num[[i]], probs = seq(0, 1, 0.33) ,na.rm = T))
# breaks =unique(quantile(df_num[[i]], probs = seq(0, 1, 0.25)))
# breaks =quantile(df_num[[i]] , names = FALSE)
label = intToChar(65:(63+length(breaks)))
# label = c("Muy Alta", "Alta", "Media", "Baja")
x <- cut(df_num[[i]], breaks=breaks,
labels = label )
# labels = c("Muy Alta","Alta" ,"Media", "Baja") )
df_cat <- cbind(df_cat, x )
}
# seteo de nombres
names(df_cat) <- names(audio_ft_to_discretize_norm)
# audio_ft_to_discretize_norm %>% filter(track_name =="Juice WRLD Speaks From Heaven - Outro")
# df_cat %>% filter(track_name =="Baby")
# df_cat %>% filter(track_name =="goodbye")
# df_cat %>% filter(track_name =="DÁKITI")
# audio_ft_to_discretize_norm %>% filter(track_name =="Rule The World (feat. Ariana Grande)")
# df_charts %>% filter(Track_Name =="Rule The World (feat. Ariana Grande)")
audio_ft_to_discretize_norm %>%
group_by(artist_name, track_name) %>%
arrange(streams_avg)
# slice(which.max(danceability))
# filter(danceability == max(danceability, na.rm =T))
#analisis de missing values
mice::md.pattern(df_cat, plot = T, rotate.names = T)
sum(!complete.cases(df_cat))
summary(VIM::aggr(df_cat, sortVar= T,plot=F))
# JOIN FINAL
#falta transformar df_charts !!!
join_audio_charts <- audio_ft_to_discretize_norm %>%
right_join( charts_sin_sesgo_norm ,
by = c("track_name" = "Track_Name",
"artist_key" ="Artist")) %>%
select(-c("artist_key", "key_name", "explicit", "mode_name" ,"key_mode")) %>%
group_by(artist_name, track_name) %>%
summarise_all( function(x) mean(x, na.rm= T)) %>%
ungroup() %>%
filter(!is.na(artist_name)) %>%
group_by(artist_name, track_name)# %>%
# mutate_all(function(x) scales::rescale(x, to=c(0,1)))
# mutate_all(function(x) (x-min(x, na.rm = T)) / (max(x, na.rm = T)-min(x, na.rm = T)) )
################################
## FILTRAMOS OUTLIERS POR Z-SCORE para 'danceability', 'tempo', 'valence'
##############################
#z-score for the variables whose distribution is roughly normal
#keep only those numeric features
#split the features by their distribution
features_continuas_media <- c('danceability', 'tempo', 'valence')
df_audio_features_zscore_media <- df_audio_features[,features_continuas_media]
# For each column, append a companion zscore_<col> column to the frame.
# NOTE(review): mean()/sd() are called here without na.rm, unlike most of
# this file — if these columns ever contain NA the z-scores become all-NA;
# confirm the columns are complete.
zscore_cols <- c()
for(col in names(df_audio_features_zscore_media)){
name_col <- paste('zscore_',col, sep = "")
zscore_cols <- append(zscore_cols, name_col)
media <- mean(df_audio_features_zscore_media[,col])
stdv <- sd(df_audio_features_zscore_media[,col])
df_audio_features_zscore_media[,name_col] <- (df_audio_features_zscore_media[,col] - media)/stdv
}
# one boxplot per standardized column, side by side
par(mfrow=c(1,length(zscore_cols)))
lapply(zscore_cols, function(col) boxplot(df_audio_features_zscore_media[,col],xlab=col))
Danceability
#variable: danceability
umbral_zscore <- 3
conditions <- (df_audio_features_zscore_media$zscore_danceability> umbral_zscore) | (df_audio_features_zscore_media$zscore_danceability< -1*umbral_zscore)
df_audio_features[conditions,] %>%
select(album_name,artist_name, danceability ) %>%
arrange(-danceability)
Tempo
#variable: Tempo
umbral_zscore <- 3
conditions <- (df_audio_features_zscore_media$zscore_tempo> umbral_zscore) | (df_audio_features_zscore_media$zscore_tempo< -1*umbral_zscore)
df_audio_features[conditions,] %>%
select(album_name,artist_name, tempo ) %>%
arrange(-tempo)
Valence
#variable: valence
umbral_zscore <- 3
conditions <- (df_audio_features_zscore_media$zscore_valence> umbral_zscore) | (df_audio_features_zscore_media$zscore_valence< -1*umbral_zscore)
df_audio_features[conditions,] %>%
select(album_name,artist_name, valence ) %>%
arrange(-valence)
################################
## FILTRAMOS OUTLIERS POR Z-SCORE MODIFICADO para 'acousticness', 'duration_ms', 'energy', 'instrumentalness', 'liveness', 'loudness', 'speechiness', 'cant_markets'
##############################
# Modified z-score (median/MAD based) for the right-skewed features —
# more robust to outliers than the mean/sd z-score above.
features_continuas_mediana <- c('acousticness', 'duration_ms', 'energy', 'instrumentalness', 'liveness', 'loudness', 'speechiness', 'cant_markets')
df_audio_features_zscore_mediana <- df_audio_features[,features_continuas_mediana]
zscoremodif_cols <- c()
for(col in names(df_audio_features_zscore_mediana)){
name_col <- paste('zscoremodif_',col, sep = "")
zscoremodif_cols <- append(zscoremodif_cols, name_col)
med = median(df_audio_features_zscore_mediana[,col], na.rm = T)
# MAD: median absolute deviation from the column median
MAD = median(abs(df_audio_features_zscore_mediana[,col] - med), na.rm = T)
# 0.6745 is the usual modified z-score constant (~ qnorm(0.75)), which
# rescales MAD to be comparable to a standard deviation under normality
df_audio_features_zscore_mediana[, name_col] <- 0.6745 * (df_audio_features_zscore_mediana[,col] - med) / MAD
}
# one horizontal boxplot per transformed column
par(mfrow=c(4,2))
lapply(zscoremodif_cols, function(col) boxplot(df_audio_features_zscore_mediana[,col],xlab=col, horizontal = T))
Revisión Variable Instrumentalness
instrumentalness <- c("instrumentalness", "zscoremodif_instrumentalness")
x <- df_audio_features$instrumentalness
n_interv <- 10
intervalos <- round(seq(0,max(x),by=(max(x)-min(x))/n_interv),2)
labs <- c()
for (i in 1:n_interv){
lab <- paste(intervalos[i],intervalos[i+1], sep='\n')
labs <- append(labs, lab)
}
bins <- cut(x, n_interv, include.lowest = TRUE, labels = labs)
barplot(table(bins))
Hacemos K-means para poder discretizar la variable.
# Elbow plot: total within-cluster SSE for k = 2..6, used to choose the
# number of clusters for discretizing instrumentalness.
# vapply with a preallocated numeric result replaces the original
# grow-with-append loop (same values, no incremental copying).
ks <- 2:6
sse <- vapply(ks, function(k) {
  kmeans(df_audio_features$instrumentalness, centers = k, iter.max = 10, nstart = k)$tot.withinss
}, numeric(1))
plot(ks, sse, type = 'l', xlab='Cantidad de Clusters', ylab='Suma Error Cuadrático')
#k=3
# Final clustering with k = 3; each row gets its cluster id as a factor,
# and a "min - max" instrumentalness range label is built per cluster.
clusters3 <- kmeans(df_audio_features$instrumentalness,centers = 3, iter.max = 10, nstart = 3)
df_audio_features$clusters <- factor(clusters3$cluster)
lev <- levels(df_audio_features$clusters)
labs <- c()
# NOTE(review): the loop variables `min`/`max` shadow base::min/base::max
# in this scope — it works, but renaming them would be safer.
for (i in lev){
min <- min(df_audio_features$instrumentalness[df_audio_features$clusters==i])
max <- max(df_audio_features$instrumentalness[df_audio_features$clusters==i])
lab <- paste(min,max, sep=' - ')
labs <- append(labs, lab)
}
labs
# barplot(table(factor(clusters3$cluster)), labels = labs)
#prueba igal de transformacion y test de normalidad
join_audio_charts[1:5,"acousticness"]^2
library(nortest)
log10(df_chart_w_lyrics$acousticness)
for (i in features_continuas){
x <- log10(df_chart_w_lyrics[,i])
x <- shapiro.test(x)
z <- x$p.value
print(z)
}
# LYRICS
library(textcat)
df_lyrics <- read.csv("data/df_lyrics.csv") %>%
select(-X)
df_lyrics_unicas <- df_lyrics %>%
distinct(artist_name, track_name, lyrics, .keep_all = T)
#join
join_ly_ft = merge(x = df_cat,
y = df_lyrics_unicas,
by.x = c("artist_key","track_name"),
by.y = c("artist_name","track_name")) %>%
distinct(lyrics, .keep_all = T)
#filtro de idioma
spa_lyrics = join_ly_ft[textcat(join_ly_ft$lyrics)=="spanish", ]
#c("artist_name", "track_name", "lyrics")]
en_lyrics = join_ly_ft[textcat(join_ly_ft$lyrics)%in% c("english", "scots"), ]
#c("artist_name", "track_name", "lyrics")]
# tabla contingencia de idiomas
# idiomas = textcat(df_lyrics_unicas$lyrics)
# sort(table(idiomas), decreasing = T)
#funciones
# Function to expand/normalize English contractions before tokenization.
# Runs after tolower() in text_cleaning(), so only lowercase forms matter.
# Order matters: specific forms ("won't", "c'mon", "let's") are handled
# before the generic apostrophe-suffix rules below them.
# NOTE(review): the bare-substring rules ("don" -> "do", "isn" -> "is",
# "couldn" -> "could", ...) have no \b word boundaries and therefore also
# rewrite unrelated words (e.g. "london" -> "londo"); confirm before
# tightening them, since fixing would change downstream word counts.
decontracted = function(txt){
txt = gsub("ain't", "aint", txt)
txt = gsub("wanna", "want to", txt)
txt = gsub("gonna", "going to", txt)
txt = gsub("won't", "will not", txt)
txt = gsub("c'mon", "come on", txt)
txt = gsub("let's", "let us", txt)
# generic apostrophe suffixes — applied after the specific forms above
txt = gsub("\\'s", " is", txt)
txt = gsub("\\'t", " not", txt)
txt = gsub("\\'ll", " will", txt)
txt = gsub("\\'m", " am", txt)
txt = gsub("\\'re", " are", txt)
txt = gsub("\\'d", " had", txt)
txt = gsub("\\'ve", " have", txt)
# negation stems left over once "'t" became " not" (e.g. "don not" -> "do not")
txt = gsub("couldn", "could", txt)
txt = gsub("don", "do", txt)
txt = gsub("doesn", "does", txt)
txt = gsub("isn", "is", txt)
txt = gsub("mustn", "must", txt)
txt = gsub("shouldn", "should", txt)
txt = gsub("wasn", "was", txt)
# slang forms: 'n / 'n' / n' as "and", u as "you", in' as "ing"
txt = gsub("\\'n", " and ", txt)
txt = gsub("\\^'n'", " and ", txt)
txt = gsub("\\^n'", " and ", txt)
txt = gsub("\\'cause", " because", txt)
txt = gsub("\\b'u\\b", " you", txt)
txt = gsub("\\bu'\\b", " you ", txt)
txt = gsub("\\bu\\b", "you", txt)
txt = gsub("\\in'", "ing", txt)
return(txt)
}
# Cleaning function: strips lyric-site annotations and normalizes the text.
# Steps: drop the leading header up to the first [section] tag, drop the
# "More on Genius" footer, remove remaining [tags], newlines, parentheses,
# split fused camelCase word pairs, lowercase, drop digits, then apply a
# language-specific step and finally collapse non-word character runs.
text_cleaning = function(txt, language){
txt = sub('^.+?\\[.*?\\]',"", txt) #ok
txt = sub("More on Genius.*","", txt)
txt = gsub('\\[.*?\\]', '', txt)
txt = gsub("\\n"," ", txt)
txt = gsub("[()]", " ", txt)
# insert a space at lowercase->uppercase transitions ("wordWord" -> "word Word")
txt = gsub("([a-z]+)([A-Z]+)", "\\1 \\2", txt)
txt = tolower(txt)
txt = gsub("\\d", " ", txt)
if(language == "en"){
# English: expand contractions (see decontracted above)
txt = decontracted(txt)
}else if (language == "es"){
# Spanish: replace enie and strip accents to plain ASCII
txt = gsub("ñi","ni", txt)
txt = gsub('ñ', 'ni', txt)
# NOTE(review): presumably stringi::stri_trans_general — stringi is never
# attached in this transcript; verify it is loaded where this runs.
txt = stri_trans_general(txt,"Latin-ASCII")
}
# collapse runs of non-word characters into single spaces
txt = gsub("\\W+\\b", " ", txt)
return(txt)
}
#limpio las letras en ingles
en_lyrics$lyrics_cleaning = text_cleaning(en_lyrics$lyrics, language = "en")
spa_lyrics$lyrics_cleaning = text_cleaning(spa_lyrics$lyrics, language = "es")
#write.csv(en_lyrics, "data/en_lyrics.csv", row.names = FALSE)
#El word_count_df se realiza con Python (ejecutar en_lyrics_word_coun)
word_counts_df <- read.csv("data/en_lyrics_word_count.csv")
x <- 1:nrow(word_counts_df)
f <- word_counts_df$counts
plot(x,f, type="l",
log = 'xy',
col="blue",
lwd=3,
ylab = "Frecuencia Absoluta",
xlab = "",
main="Frecuencia de Palabras - Ley de Zipf")
library(stopwords)
Attaching package: 'stopwords'
The following object is masked from 'package:tm':
stopwords
threshold <- 1000
x <- as.vector(word_counts_df[word_counts_df$counts>threshold,]$word)
not_remove_list = c("bitch","fuck","love","baby","nigga","feel", "girl", "shit")
top_stopwords <- setdiff(x, not_remove_list)
smart_stopwords <- stopwords("en", source = "smart")
my_eng_stopwords <- unique(append(smart_stopwords, top_stopwords))
my_spa_stopwords <- unique(text_cleaning(stopwords("es", source = "stopwords-iso"), language = "es"))
# Remove every word in `stopword_list` from `txt`, then collapse the
# whitespace left behind. A stopword only matches when followed by a
# non-word character, so a stopword at the very end of the string is
# intentionally kept (mirrors the original regex exactly).
del_stopwords <- function(txt, stopword_list) {
  alternation <- paste(stopword_list, collapse = "|")
  remove_regex <- paste0("\\b(", alternation, ")\\W")
  cleaned <- gsub(remove_regex, " ", txt)
  gsub("\\W+\\b", " ", cleaned)
}
spa_lyrics$sin_stopwords <- del_stopwords(spa_lyrics$lyrics_cleaning, my_spa_stopwords)
en_lyrics$sin_stopwords <- del_stopwords(en_lyrics$lyrics_cleaning, my_eng_stopwords)
#Diccionario español
malas_palabras_1 <- read_csv("data/malas_palabras.txt",
col_names = FALSE)
-- Column specification ------------------------------------------------------------------------------------------
cols(
X1 = col_character()
)
malas_palabras_2 <- read_csv("data/malas_palabras_translate.txt",
col_names = FALSE)
-- Column specification ------------------------------------------------------------------------------------------
cols(
X1 = col_character()
)
malas_palabras_3 <- read_csv("data/malas_palabras_wiki.txt",
col_names = FALSE) %>%
select(X1)
-- Column specification ------------------------------------------------------------------------------------------
cols(
X1 = col_character(),
X2 = col_character(),
X3 = col_character(),
X4 = col_character(),
X5 = col_character(),
X6 = col_character()
)
52 parsing failures.
row col expected actual file
2 -- 6 columns 3 columns 'data/malas_palabras_wiki.txt'
3 -- 6 columns 2 columns 'data/malas_palabras_wiki.txt'
4 -- 6 columns 3 columns 'data/malas_palabras_wiki.txt'
5 -- 6 columns 4 columns 'data/malas_palabras_wiki.txt'
6 -- 6 columns 2 columns 'data/malas_palabras_wiki.txt'
... ... ......... ......... ..............................
See problems(...) for more details.
malas_palabras_4 <- read_csv("data/palabras_profanas_es.txt",
col_names = FALSE)
-- Column specification ------------------------------------------------------------------------------------------
cols(
X1 = col_character()
)
# Stack the four Spanish dictionaries into one table.
malas_palabras <- rbind(malas_palabras_1, malas_palabras_2,
malas_palabras_3, malas_palabras_4)
# Normalize terms with the same cleaning applied to the lyrics so matches
# line up. NOTE(review): the original comment said "unique to drop repeats"
# but no unique() is applied; duplicates are harmless for the %in% lookups
# below, just redundant.
malas_palabras$limpias = text_cleaning(malas_palabras$X1, language="es")
# English racist-terms vocabulary from the lexicon package.
racist_words <- unique(tolower(lexicon::profanity_racist))
# "Big Lou" bad-word list, fetched over the network (fails when offline).
biglou <- read.csv("https://www.cs.cmu.edu/~biglou/resources/bad-words.txt", header=FALSE, col.names = c("words"))
####################################################################
####### Generación de la Matríz Término-Documento del corpus #######
####################################################################
#función
# Build a binary document-term matrix for the n_terms most frequent terms of
# a corpus.
#
# corpus      a tm corpus.
# ponderacion weighting passed to TermDocumentMatrix (e.g. "weightTf").
# n_terms     how many of the most frequent terms to keep.
#
# Returns a documents-x-terms matrix of 0/1 presence indicators, with the
# selected terms in alphabetical order.
corpus.pro2tdm <- function(corpus, ponderacion, n_terms) {
  # Term-document matrix under the requested weighting.
  tdm <- TermDocumentMatrix(corpus,
                            control = list(weighting = ponderacion))
  tdm_mat <- as.matrix(tdm)
  # Corpus-wide frequency of every term; keep the top n_terms.
  top_terms <- head(sort(rowSums(tdm_mat), decreasing = TRUE), n_terms)
  # Transpose so documents are rows; order the kept terms alphabetically.
  doc_term <- t(tdm_mat[sort(names(top_terms)), ])
  # Binarize: presence/absence instead of raw counts.
  doc_term[doc_term > 0] <- 1
  doc_term
}
# English corpus: one document per song (stopword-free lyrics); keep a binary
# presence matrix of the 150 most frequent terms.
corpus_eng = Corpus(VectorSource(enc2utf8(en_lyrics$sin_stopwords)))
matriz <- corpus.pro2tdm(corpus = corpus_eng, ponderacion= "weightTf",n_terms= 150)
df_tm <- as.data.frame(matriz)
# Spanish corpus: same pipeline, 150 most frequent Spanish terms.
corpus_esp = Corpus(VectorSource(enc2utf8(spa_lyrics$sin_stopwords)))
matriz_esp <- corpus.pro2tdm(corpus = corpus_esp, ponderacion= "weightTf",n_terms= 150)
df_tm_esp <- as.data.frame(matriz_esp)
## Join the term matrix with the first 19 metadata columns of the lyrics
## table. NOTE(review): cbind assumes both objects have the same row count
## and order — the nrow() checks below confirm the counts match (1029).
df_ly_feat <- cbind(en_lyrics[, 1:19], df_tm)
nrow(df_tm)
[1] 1029
nrow(en_lyrics) # 1055 instancias
[1] 1029
nrow(df_ly_feat) #joinean 869
[1] 1029
# NOTE(review): na.omit() here is a no-op — its result is printed, not
# assigned back. md.pattern() below confirms the term columns have no NAs.
na.omit(df_ly_feat)
mice::md.pattern(df_ly_feat[, 153:ncol(df_ly_feat)], rotate.names = T)
/\ /\
{ `---' }
{ O O }
==> V <== No need for mice. This data set is completely observed.
\ \|/ /
`-----'
times told tonight top touch tryna turn wait waiting walk watch woah woo work world wrong young
1029 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
# Column mask that drops the identifier columns; reused later for the
# Spanish frame. NOTE(review): `filter` is also a dplyr verb — function
# calls like filter(...) still resolve to dplyr::filter, but the shadowed
# name is confusing.
filter <- !names(df_ly_feat) %in% c("artist_name", "track_name" )
df_ly_feat_ok <- df_ly_feat[, filter]
# df_ly_feat_ok = df_ly_feat_ok[, -(which(colSums(df_ly_feat_ok) == 0))]
# colSums(df_ly_feat_ok)
head(df_ly_feat_ok, 3)
head(df_ly_feat, 3)
# Transaction id: one per song/row.
df_ly_feat$id = 1:nrow(df_ly_feat)
# Long format: one row per (song id, variable). Columns 4+ of df_ly_feat mix
# the remaining metadata columns, the categorical *_cat columns and the
# binary term columns — hence the "attributes ... dropped" warning below.
df_melt <- reshape2::melt(data = df_ly_feat[,4:ncol(df_ly_feat)], id.vars = c("id")) %>%
arrange(id)
attributes are not identical across measure variables; they will be dropped
# Keep only present items, then split binary term indicators (value == 1)
# from categorical variables (any other value), which are encoded as
# "name=value" items for the transaction format.
df_melt <- df_melt[df_melt$value != 0,]
df_melt_txt <- df_melt[df_melt$value == 1,]
df_melt_cat <- df_melt[df_melt$value != 1,]
df_melt_cat$variable = paste0(df_melt_cat$variable, "=", as.character(df_melt_cat$value))
head(df_melt_txt )
dim(df_melt_txt )
[1] 22089 3
# Prefix each English term item as PROFANE_ / RACIST_ / TERM_.
# NOTE(review): case_when evaluates branches in order, so a word present in
# both biglou and racist_words is tagged PROFANE_, never RACIST_ — confirm
# this precedence is intended.
df_melt_txt <- df_melt_txt %>%
mutate(variable = case_when(as.character(variable) %in% biglou$words ~
paste0("PROFANE_", as.character(variable)),
as.character(variable) %in% racist_words ~
paste0("RACIST_", as.character(variable)),
T ~ paste0("TERM_", as.character(variable))
)
)
df_melt_txt %>% filter(startsWith(variable, "PROF"))
# df_melt_txt[df_melt_txt$variable %in% biglou$words,]
# Spanish: join the term matrix with the song metadata (same row-order
# assumption as the English cbind above — nrow() checks confirm 154 rows).
df_ly_feat_esp <- cbind(spa_lyrics[, 1:19], df_tm_esp)
nrow(spa_lyrics)
[1] 154
nrow(df_ly_feat_esp)
[1] 154
# NOTE(review): na.omit() result is discarded (no-op), mirroring the English
# branch; md.pattern() confirms the Spanish term columns have no NAs.
na.omit(df_ly_feat_esp)
mice::md.pattern(df_ly_feat_esp[, 153:ncol(df_ly_feat_esp)], rotate.names = T)
/\ /\
{ `---' }
{ O O }
==> V <== No need for mice. This data set is completely observed.
\ \|/ /
`-----'
uah uoh vamo vas ven veo verte vida viene vuelvo woh work wuh yah yeah yeh you
154 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
# Reuse the English column mask, add a transaction id, then melt.
# NOTE(review): unlike the English branch (which melted the unfiltered
# df_ly_feat), this melts the *filtered* frame — confirm the asymmetry is
# intended.
df_ly_feat_ok_esp <- df_ly_feat_esp[, filter]
df_ly_feat_ok_esp$id = 1:nrow(df_ly_feat_ok_esp)
df_melt_esp <- reshape2::melt(data = df_ly_feat_ok_esp[,4:ncol(df_ly_feat_ok_esp)], id.vars = c("id")) %>%
arrange(id)
attributes are not identical across measure variables; they will be dropped
# Present items only; split binary terms (value == 1) from categorical
# variables, which become "name=value" items.
df_melt_esp <- df_melt_esp[df_melt_esp$value != 0,]
df_melt_esp_txt <- df_melt_esp[df_melt_esp$value == 1,]
df_melt_esp_cat <- df_melt_esp[df_melt_esp$value != 1,]
df_melt_esp_cat$variable = paste0(df_melt_esp_cat$variable, "=", as.character(df_melt_esp_cat$value))
# Prefix Spanish profane terms (per the combined dictionary) as PROFANE_,
# everything else as TERM_.
df_melt_esp_txt <- df_melt_esp_txt %>%
mutate(variable = case_when(as.character(variable) %in% malas_palabras$limpias ~
paste0("PROFANE_", as.character(variable)),
T ~ paste0("TERM_", as.character(variable))
)
)
# Build (TID, item) transaction tables, Spanish and English, by stacking the
# term items and categorical items and dropping the numeric value column.
df_melt_txt_to_ruls_esp <- rbind(df_melt_esp_txt, df_melt_esp_cat)
df_melt_txt_to_ruls_esp <- na.omit(df_melt_txt_to_ruls_esp[,-c(3)])
names(df_melt_txt_to_ruls_esp ) = c("TID", "item")
df_melt_txt_to_ruls <- rbind(df_melt_txt, df_melt_cat)
df_melt_txt_to_ruls <- na.omit(df_melt_txt_to_ruls[,-c(3)])
names(df_melt_txt_to_ruls ) = c("TID", "item")
# NOTE(review): some items embed entire raw lyrics (the "lyrics=" categorical
# items seen in the inspect output below), newlines included — confirm
# read.transactions parses this file back cleanly.
write.table(df_melt_txt_to_ruls, file="data/transaccions_lyrics_features.txt", row.names = F)
# Association rules over the English transactions.
# arules::inspect is qualified explicitly because tm also exports inspect()
# and masks the arules version (see the package attach messages).
lyrics_trans <- read.transactions("data/transaccions_lyrics_features.txt", format = "single", cols = c(1,2))
arules::inspect(head(lyrics_trans, 3))
items transactionID
[1] {cant_markets_cat=Media,
danceability_cat=Muy alta,
duration_ms_cat=Alta,
explicit_cat=No,
instrumentalness_cat=Baja,
liveness_cat=Muy alta,
loudness_cat=Alta,
lyrics=
Bigger Than You Lyrics
[Intro: Quavo & 2 Chainz]2 Chainz (Murda on the beat so it's not nice)
Yeah, VIP my squad (yeah), drop off all the gang (yeah)
VIP the lane (skrrt), VIP the chain (squad)
VIP my squad (yeah, squad), drop off all the gang (yeah)
VIP the lane (skrrt, yeah), VIP the chain[Chorus: Quavo & 2 Chainz]Uh, yeah, baller alert
Lil biddy bitch I don't call her alert (brrrrp)
Uh, yeah, follow alert
Go get the fire when I'm callin' 'em merked
Uh, yeah, profit come first
I whip the baby, the baby gon' burp (whip it)Uh, yeah, I bought a 'Claren (woo)
I bought a 'Claren, didn't wanna buy verts, uh
This shit bigger than you (hey), I'm takin' on a new path (uh)
Making them bricks take a bath (woo)
Lil biddy bitch, do the math (yeah)
Lil nigga, who are you? (who you, yeah)
Must be bulletproof (brrrp)
This shit bigger than you (it big)
This shit bigger than you
[Verse 1: 2 Chainz]Chain so big, should have came with a kick standFuck with me, I got a retainer on a hit man (bop)Barely came up out the mud like quicksand (barely)I show you how to get millions, nigga, that's a mil plan, nowUh, yeah, ring the alarm, Cartier bracelets on all of my armsUh, yeah, halo my son, in the wheelchair, and I still perform (uh)
I don't make excuses, you know that I'm hungry, I still got the juice (uh)You set it off like Cleo, I set it off like Boosie[Chorus: Quavo & 2 Chainz]Uh, yeah, baller alert
Lil biddy bitch I don't call her alert (brrrrp)
Uh, yeah, follow alert
Go get the fire when I'm callin' 'em merked
Uh, yeah, profit come first
I whip the baby, the baby gon' burp (whip it)Uh, yeah, I bought a 'Claren (woo)
I bought a 'Claren, didn't wanna buy verts, uh
This shit bigger than you (hey)
I'm takin' on a new path (uh)
Making the bricks take a bath (woo)
Lil biddy bitch, do the math (yeah)
Lil nigga, who are you? (who you?)
Must be bulletproof (brrrp) (bye)
This shit bigger than you (it big)
This shit bigger than you
[Verse 2: Drake & Quavo]Young champagne checkin' in, man, Tity Boi shit ringin' off'Member I was on pre-paid, I would act like my shit was ringin' off
'Member shorty told me she thought the raps good but the singing's offWatch on Young Dro now, man, boi-oing shit blingin' off
Where the racks at? (racks)
All I know is they keep comin' to me like a flashback, nigga, what? (what?)Half a million out in Vegas, it ain't no blackjack, nigga, naw (naw)Quavo Sinatra, but we could never be the Rat Pack, nigga, naw[Chorus: Quavo & 2 Chainz]Uh, yeah, baller alert
Lil biddy bitch I don't call her alert (brrrrp)
Uh, yeah, follow alert
Go get the fire when I'm callin' 'em merked
Uh, yeah, profit come first
I whip the baby, the baby gon' birth (whip it)Uh, yeah, I bought a 'Claren (woo)
I bought a 'Claren, didn't wanna buy verts, uh
This shit bigger than you (hey)
I'm takin' on a new path (uh)
Making the bricks take a bath (woo)
Lil biddy bitch, do the math (yeah, tell 'em)
Lil nigga, who are you? (who you?)
Must be bulletproof (brrrp) (bye)
This shit bigger than you (it big)
This shit bigger than you
[Verse 3: 2 Chainz]Touchscreen on my cars, vintage one to twoI just bought a watch that's plain like a Dickie suit (plain)I sip some red wine, and chased it with the '42
They asked me what I call millions, comin' soon
Yeah (yeah), I just cashed out (uh)
Ain't got time for a beef, I'm a cash cow (tell 'em)When I was in juvie, I made 'em back out (back that ass up)
It was 400 degrees, you would have passed out (ooh)[Chorus: Quavo & 2 Chainz]Uh, yeah, baller alert
Lil biddy bitch I don't call her alert (brrrrp)
Uh, yeah, follow alert
Go get the fire when I'm callin' 'em merked (yeaaaah)
Uh, yeah, profit come first
I whip the baby, the baby gon' birth (whip it)Uh, yeah, I bought a 'Claren (woo)
I bought a 'Claren, didn't wanna buy verts, uh
This shit bigger than you (hey)
I'm takin' on a new path (uh)
Making the bricks take a bath (woo)
Lil biddy bitch, do the math (yeah)
Lil nigga, who are you? (who you?)
Must be bulletproof (brrrp) (bye)
This shit bigger than you (it big)
This shit bigger than you[Outro: Quavo & 2 Chainz]
VIP my squad (yeah), drop off all the gang (yeah)
VIP the lane (skrrt), VIP the chain (squad)
VIP my squad (yeah) (yeah), drop off all the gang (yeah)
VIP the lane (skrrt), VIP the chain
More on Genius
Murda Beatz Gave Travis Scott’s “BUTTERFLY EFFECT” Beat To Nicki Minaj & Quavo First
Quavo Repurposes His Unreleased “Butterfly Effect (Remix)” Verse On 2 Chainz & Drake’s “Bigger Than You”
,
mode_name_cat=major,
popularidad_cat=Alta,
position_median_cat=Baja,
PROFANE_ass,
PROFANE_bitch,
PROFANE_fire,
PROFANE_fuck,
PROFANE_nigga,
PROFANE_shit,
semanas_sum_cat=Alta,
speechiness_cat=Muy alta,
streams_avg_cat=Media,
tempo_cat=Muy alta,
TERM_baby,
TERM_big,
TERM_call,
TERM_coming,
TERM_didn,
TERM_drop,
TERM_gang,
TERM_gon,
TERM_good,
TERM_hey,
TERM_hit,
TERM_lil,
TERM_made,
TERM_man,
TERM_ring,
TERM_show,
TERM_skrrt,
TERM_thought,
TERM_told,
TERM_watch,
TERM_woo,
TERM_young,
valence_cat=Baja} 1
[2] {lyrics=
CITY OF ANGELS Lyrics
[Chorus]I sold my soul to the devil for designer
They said, "Go to hell," but I told 'em I don’t wanna
If you know me well, then you know that I ain't goin', ’causeI don't wanna, I don't wanna, I don't wanna die young
The city of angels where I have my funDon't wanna die young
When I'm gone, remember all I've done, 'one
[Verse]
We’ve had our fun, ’un
But now I'm done, ’one
'Cause you crazy (Yeah), I can't take it (No)
Just wanted to see you naked
Heard time like money, can't waste it
What’s the price of fame? 'Cause I can taste it
So I'm chasin' (Yeah), and I'm facin'
A little Hennessy, it might be good for me
[Chorus]
I sold my soul to the devil for designer
They said, "Go to hell," but I told 'em I don't wanna
If you know me well, then you know that I ain't goin', 'causeI don't wanna, I don't wanna, I don't wanna die young
The city of angels where I have my funDon't wanna die young
When I'm gone, remember all I've done, 'one
More on Genius
24kGoldn Breaks Down The Meaning Of "City Of Angels"
,
popularidad_cat=Muy alta,
position_median_cat=Muy alta,
PROFANE_die,
semanas_sum_cat=Muy alta,
streams_avg_cat=Alta,
TERM_crazy,
TERM_doe,
TERM_good,
TERM_money,
TERM_told,
TERM_young} 10
[3] {cant_markets_cat=Baja,
danceability_cat=Media,
duration_ms_cat=Alta,
explicit_cat=No,
instrumentalness_cat=Alta,
liveness_cat=Muy alta,
loudness_cat=Alta,
lyrics=
Waiting for Love Lyrics
[Verse 1]Where there's a will, there's a way, kinda beautiful
And every night has its day, so magical
And if there's love in this life, there's no obstacle
That can't be defeatedFor every tyrant a tear for the vulnerableIn every lost soul the bones of a miracleFor every dreamer a dream we're unstoppableWith something to believe in
[Chorus]Monday left me broken
Tuesday I was through with hopin'
Wednesday my empty arms were open
Thursday waiting for love, waiting for love
Thank the stars it's Friday
I'm burning like a fire gone wild on Saturday
Guess I won't be coming to church on Sunday
I'll be waiting for love
Waiting for love to come around
[Instrumental Break]
[Verse 2]
We are one of a kind, irreplaceable
How did I get so blind and so cynical?
If there's love in this life we're unstoppable
No, we can't be defeated
[Chorus]Monday left me broken
Tuesday I was through with hopin'
Wednesday my empty arms were open
Thursday waiting for love, waiting for love
Thank the stars it's Friday
I'm burning like a fire gone wild on Saturday
Guess I won't be coming to church on Sunday
I'll be waiting for love
Waiting for love to come around
(Co-co-come around)
More on Genius
,
mode_name_cat=minor,
popularidad_cat=Alta,
position_median_cat=Alta,
PROFANE_fire,
semanas_sum_cat=Media,
speechiness_cat=Media,
streams_avg_cat=Muy alta,
tempo_cat=Alta,
TERM_coming,
TERM_day,
TERM_guess,
TERM_left,
TERM_life,
TERM_lost,
TERM_love,
TERM_night,
TERM_waiting,
valence_cat=Alta} 100
summary(lyrics_trans)
transactions as itemMatrix in sparse format with
1030 rows (elements/itemsets/transactions) and
1231 columns (items) and a density of 0.02728384
most frequent items:
cant_markets_cat=Alta instrumentalness_cat=Baja loudness_cat=Alta mode_name_cat=major
550 546 532 506
TERM_love (Other)
497 31963
element (itemset/transaction) length distribution:
sizes
1 5 7 9 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44
1 1 1 1 5 8 3 5 6 6 5 19 21 14 21 14 21 34 36 46 47 49 57 49 43 51 27 30 35 36 29 22 23 24 16 21 13 17
45 46 47 48 49 50 51 52 53 54 55 56 57 58 60 61 62 63 64 65 66 67 68 71 72 78
12 13 10 16 17 8 12 14 12 11 5 9 3 2 4 7 2 2 3 3 2 2 1 1 1 1
Min. 1st Qu. Median Mean 3rd Qu. Max.
1.00 26.00 32.00 33.59 40.00 78.00
includes extended item information - examples:
includes extended transaction information - examples:
# Mine rules. NOTE(review): with support = 0.01 this run hit apriori's time
# limit at itemset length 6 and still produced ~13.5M rules — consider a
# higher support threshold or a maxlen cap for a tractable rule set.
reglas <- apriori(lyrics_trans, parameter = list(support=0.01,
confidence = 0.1, target = "rules" ))
Apriori
Parameter specification:
Algorithmic control:
Absolute minimum support count: 10
set item appearances ...[0 item(s)] done [0.00s].
set transactions ...[1231 item(s), 1030 transaction(s)] done [0.02s].
sorting and recoding items ... [199 item(s)] done [0.01s].
creating transaction tree ... done [0.00s].
checking subsets of size 1 2 3 4 5 6
Mining stopped (time limit reached). Only patterns up to a length of 6 returned!
done [5.27s].
writing ... [13561421 rule(s)] done [70.26s].
creating S4 object ... done [35.60s].
# Rules whose antecedent contains a profane term.
# Fix: items are prefixed "PROFANE_" (see the inspect output above), and
# %pin% does substring matching — "PROF_" is not a substring of
# "PROFANE_...", so the old subset was always empty.
reglas_sub <- subset(reglas, subset = lhs %pin% "PROFANE_")
arules::inspect(head(sort(reglas_sub, by = "lift", decreasing = TRUE), 5))
# Deduplicate lyrics to one row per (artist, track, lyrics).
df_lyrics_unicas <- df_lyrics %>% distinct(artist_name, track_name, lyrics)
nrow(df_lyrics_unicas)
[1] 1362
# Left-join lyrics onto the audio/chart table, then drop songs whose lyrics
# were not found.
df_chart_w_lyrics <- merge(join_audio_charts, df_lyrics_unicas, by.x = c("artist_name","track_name"), by.y= c("artist_name","track_name"), all.x=TRUE, all.y = FALSE)
df_chart_w_lyrics <- df_chart_w_lyrics[!is.na(df_chart_w_lyrics$lyrics),]
# Combined English profanity vocabulary: five lexicon lists, lowercased and
# deduplicated. unique() keeps the first occurrence, so the result matches
# the original append-then-unique chain exactly, content and order.
bad_words <- unique(tolower(c(
  lexicon::profanity_zac_anger,
  lexicon::profanity_alvarez,
  lexicon::profanity_arr_bad,
  lexicon::profanity_racist,
  lexicon::profanity_banned
)))
# Count profanity hits in a lyrics string.
#
# x              character input (typically one song's lyrics).
# profanity_list profanity vocabulary; defaults to the global `bad_words`
#                built above. Parameterized so the function no longer
#                hard-wires the global (backward compatible: one-argument
#                calls behave exactly as before).
#
# Returns the total profanity count reported by profanity() (sentimentr-style
# API: a table with a profanity_count column).
contar_bad_words <- function(x, profanity_list = bad_words){
  hits <- profanity(x, profanity_list = profanity_list)
  sum(hits$profanity_count)
}
# Per-song profanity count; keep only explicit-flagged songs with at least
# one hit, then eyeball the distribution.
df_chart_w_lyrics$cant_bad_words <- sapply(df_chart_w_lyrics[,"lyrics"], contar_bad_words)
df_chart_w_lyrics_only_explicit <- df_chart_w_lyrics[df_chart_w_lyrics$explicit==TRUE & df_chart_w_lyrics$cant_bad_words > 0, ]
hist(df_chart_w_lyrics_only_explicit$cant_bad_words)
# Categorical levels for rule mining.
df_chart_w_lyrics_only_explicit$nivel_puteada <- cut(df_chart_w_lyrics_only_explicit$cant_bad_words, breaks = c(0,10,20,50,Inf), labels=c("bajo","poco","alto","muy_alto"))
# Fix: cut() excludes the lowest break by default, so songs with
# position_avg == 1 (chart-toppers) were mapped to NA; include.lowest keeps
# them in the "1a100" bucket.
df_chart_w_lyrics_only_explicit$nivel_ranking <- cut(df_chart_w_lyrics_only_explicit$position_avg, breaks = c(1,100,Inf), labels=c("1a100","100a200"), include.lowest = TRUE)
# NOTE(review): "nivel_popularidad" is computed from cant_bad_words, not from
# a popularity column — looks like a copy-paste slip; confirm the intended
# source column before relying on this variable.
df_chart_w_lyrics_only_explicit$nivel_popularidad <- cut(sqrt(df_chart_w_lyrics_only_explicit$cant_bad_words), breaks = c(0,10,20,50,Inf), labels=c("bajo","poco","alto","muy_alto"))
# NOTE(review): apply() over a data.frame coerces everything to a character
# matrix first, so every column — including free-text lyrics — becomes
# factor items; hence the 10525 items reported below.
transactions <- as(as.data.frame(apply(df_chart_w_lyrics_only_explicit, 2, as.factor)), "transactions")
Column(s) 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37 not logical or factor. Applying default discretization (see '? discretizeDF').
# Mine rules on the explicit-songs transactions.
rules = apriori(transactions, parameter=list(target="rules", confidence=0.25, support=0.1))
Apriori
Parameter specification:
Algorithmic control:
Absolute minimum support count: 83
set item appearances ...[0 item(s)] done [0.00s].
set transactions ...[10525 item(s), 838 transaction(s)] done [0.02s].
sorting and recoding items ... [23 item(s)] done [0.00s].
creating transaction tree ... done [0.00s].
checking subsets of size 1 2 3 4 5 6 7 done [0.00s].
writing ... [1576 rule(s)] done [0.01s].
creating S4 object ... done [0.00s].
# Rules linking profanity level (lhs) to chart-ranking level (rhs).
rules.sub <- subset(rules, subset = lhs %pin% "nivel_puteada" & rhs %pin% "nivel_ranking")
# Fix: the bare inspect() dispatched to tm::inspect — tm masks
# arules::inspect (see the package attach messages) — which caused the
# "no applicable method for 'inspect'" error here; qualify the namespace.
arules::inspect(head(sort(rules.sub, by = "lift", decreasing = TRUE), 10))
Error in UseMethod("inspect", x) :
no applicable method for 'inspect' applied to an object of class "c('rules', 'associations')"